/*
 * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
 * SPDX-License-Identifier: MIT
 */

#include "radeon_compiler.h"

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "radeon_compiler_util.h"
#include "radeon_dataflow.h"
#include "radeon_program.h"
#include "radeon_program_pair.h"
#include "radeon_regalloc.h"

void
rc_init(struct radeon_compiler *c, const struct rc_regalloc_state *rs)
{
   memset(c, 0, sizeof(*c));

   memory_pool_init(&c->Pool);
   c->Program.Instructions.Prev = &c->Program.Instructions;
   c->Program.Instructions.Next = &c->Program.Instructions;
   c->Program.Instructions.U.I.Opcode = RC_OPCODE_ILLEGAL_OPCODE;
   c->regalloc_state = rs;
   c->max_temp_index = -1;
}

/**
 * Release everything owned by a compiler instance: the program's constant
 * list, the memory pool and the stored error message (if any).
 *
 * The structure \p c itself is not freed here.
 */
void
rc_destroy(struct radeon_compiler *c)
{
   /* Constants first, then the pool. */
   rc_constants_destroy(&c->Program.Constants);
   memory_pool_destroy(&c->Pool);

   /* ErrorMsg is heap-allocated by rc_error(); free(NULL) is a no-op. */
   free(c->ErrorMsg);
}

/**
 * printf-style debug output to stderr.
 *
 * Nothing is printed unless RC_DBG_LOG is set in c->Debug.
 */
void
rc_debug(struct radeon_compiler *c, const char *fmt, ...)
{
   if (c->Debug & RC_DBG_LOG) {
      va_list args;

      va_start(args, fmt);
      vfprintf(stderr, fmt, args);
      va_end(args);
   }
}

/**
 * Flag a compilation error and record a printf-style message for it.
 *
 * c->Error is always set. Only the first error message is remembered in
 * c->ErrorMsg (heap-allocated, released by rc_destroy()); later calls leave
 * it untouched. When RC_DBG_LOG is set in c->Debug the message is also
 * printed to stderr, for every call.
 *
 * If the message cannot be stored (formatting or allocation failure),
 * c->ErrorMsg is either the unexpanded format string or NULL; c->Error is
 * set regardless.
 */
void
rc_error(struct radeon_compiler *c, const char *fmt, ...)
{
   va_list ap;

   c->Error = 1;

   if (!c->ErrorMsg) {
      /* Only remember the first error */
      char buf[1024];
      int written;

      va_start(ap, fmt);
      written = vsnprintf(buf, sizeof(buf), fmt, ap);
      va_end(ap);

      if (written < 0) {
         /* vsnprintf reported an error, so buf holds nothing usable.
          * Comparing the negative value against sizeof(buf) would convert it
          * to a huge unsigned number and take the "too long" path with a
          * zero-byte allocation. Keep the raw format string instead so that
          * ErrorMsg is still a valid C string.
          */
         c->ErrorMsg = strdup(fmt);
      } else if ((size_t)written < sizeof(buf)) {
         c->ErrorMsg = strdup(buf);
      } else {
         /* Message did not fit into buf: format again into an exact-size
          * heap buffer.
          */
         size_t size = (size_t)written + 1;

         c->ErrorMsg = malloc(size);
         if (c->ErrorMsg) {
            va_start(ap, fmt);
            vsnprintf(c->ErrorMsg, size, fmt, ap);
            va_end(ap);
         }
      }
   }

   if (c->Debug & RC_DBG_LOG) {
      fprintf(stderr, "r300compiler error: ");

      va_start(ap, fmt);
      vfprintf(stderr, fmt, ap);
      va_end(ap);
   }
}

/**
 * Report a failed internal assertion through rc_error().
 *
 * \param file       source file name to include in the message
 * \param line       source line number to include in the message
 * \param assertion  text of the assertion that failed
 * \return always 1
 */
int
rc_if_fail_helper(struct radeon_compiler *c, const char *file, int line, const char *assertion)
{
   rc_error(c, "ICE at %s:%i: assertion failed: %s\n", file, line, assertion);

   /* Unconditionally signal failure to the caller. */
   return 1;
}

/**
 * For every instruction, replace the swizzle of each source channel that is
 * not needed to produce the destination write mask with RC_SWIZZLE_UNUSED.
 *
 * The set of needed channels per source comes from
 * rc_compute_sources_for_writemask(). \p user is not used.
 */
void
rc_mark_unused_channels(struct radeon_compiler *c, void *user)
{
   struct rc_instruction *const head = &c->Program.Instructions;
   struct rc_instruction *cur = head->Next;
   unsigned int needed[3];

   while (cur != head) {
      rc_compute_sources_for_writemask(cur, cur->U.I.DstReg.WriteMask, needed);

      for (unsigned int s = 0; s < 3; s++) {
         for (unsigned int ch = 0; ch < 4; ch++) {
            /* Channel is read: leave its swizzle alone. */
            if (GET_BIT(needed[s], ch))
               continue;

            SET_SWZ(cur->U.I.SrcReg[s].Swizzle, ch, RC_SWIZZLE_UNUSED);
         }
      }

      cur = cur->Next;
   }
}

/**
 * Rebuild the c->Program.InputsRead and c->Program.OutputsWritten bitmasks
 * from scratch by scanning every instruction: bit N of InputsRead is set
 * when some source operand reads input N, bit N of OutputsWritten is set
 * when some destination writes output N.
 */
void
rc_calculate_inputs_outputs(struct radeon_compiler *c)
{
   struct rc_instruction *const head = &c->Program.Instructions;

   c->Program.OutputsWritten = 0;
   c->Program.InputsRead = 0;

   for (struct rc_instruction *cur = head->Next; cur != head; cur = cur->Next) {
      const struct rc_opcode_info *info = rc_get_opcode_info(cur->U.I.Opcode);

      /* Only the operands the opcode actually has are inspected. */
      for (int src = 0; src < info->NumSrcRegs; ++src) {
         if (cur->U.I.SrcReg[src].File != RC_FILE_INPUT)
            continue;

         c->Program.InputsRead |= 1U << cur->U.I.SrcReg[src].Index;
      }

      if (info->HasDstReg && cur->U.I.DstReg.File == RC_FILE_OUTPUT)
         c->Program.OutputsWritten |= 1U << cur->U.I.DstReg.Index;
   }
}

/**
 * Rewrite the program such that a given output is duplicated.
 *
 * Every write to output \p output is redirected to a fresh temporary.
 * Afterwards either
 *  - the single writing instruction is restored and cloned so that one copy
 *    writes \p output and the other \p dup_output, or
 *  - (zero or several writes) two MOVs from the temporary to \p output and
 *    \p dup_output are inserted at a position that is executed
 *    unconditionally after the last write.
 *
 * Bit \p dup_output is added to c->Program.OutputsWritten.
 */
void
rc_copy_output(struct radeon_compiler *c, unsigned output, unsigned dup_output)
{
   unsigned tempreg = rc_find_free_temporary(c);
   struct rc_instruction *inst;
   struct rc_instruction *insert_pos = c->Program.Instructions.Prev;
   struct rc_instruction *last_write_inst = NULL;
   unsigned branch_depth = 0;
   unsigned loop_depth = 0;
   bool emit_after_control_flow = false;
   unsigned num_writes = 0;

   for (inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
      const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
      bool closed_block = false;

      if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP)
         loop_depth++;
      if (inst->U.I.Opcode == RC_OPCODE_IF)
         branch_depth++;

      /* Track block ends. The depth is only decremented when it is non-zero
       * so that an unmatched ENDLOOP/ENDIF cannot wrap the unsigned counter.
       */
      if (inst->U.I.Opcode == RC_OPCODE_ENDLOOP && loop_depth > 0) {
         loop_depth--;
         closed_block = true;
      } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF && branch_depth > 0) {
         branch_depth--;
         closed_block = true;
      }

      /* The outermost block containing the last write has just ended: this
       * is the first unconditionally executed position after that write.
       */
      if (closed_block && emit_after_control_flow && loop_depth == 0 && branch_depth == 0) {
         insert_pos = inst;
         emit_after_control_flow = false;
      }

      if (opcode->HasDstReg && inst->U.I.DstReg.File == RC_FILE_OUTPUT &&
          inst->U.I.DstReg.Index == output) {
         num_writes++;
         inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
         inst->U.I.DstReg.Index = tempreg;
         insert_pos = inst;
         last_write_inst = inst;

         /* A write inside ANY control flow (a loop or a branch, not only
          * both at once) must have its copy MOVs placed after that control
          * flow. Otherwise the MOVs would sit inside the block and earlier
          * writes to the temporary would never reach the outputs when the
          * block is not executed.
          */
         if (loop_depth != 0 || branch_depth != 0)
            emit_after_control_flow = true;
      }
   }

   /* If there is only a single write, just duplicate the whole instruction instead.
    * We can do this even when the single write is inside control flow.
    */
   if (num_writes == 1) {
      /* Undo the redirection to the temporary done in the scan above. */
      last_write_inst->U.I.DstReg.File = RC_FILE_OUTPUT;
      last_write_inst->U.I.DstReg.Index = output;

      /* Clone the instruction in place; the list links of the freshly
       * inserted node must survive the memcpy.
       */
      inst = rc_insert_new_instruction(c, last_write_inst);
      struct rc_instruction *prev = inst->Prev;
      struct rc_instruction *next = inst->Next;
      memcpy(inst, last_write_inst, sizeof(struct rc_instruction));
      inst->Prev = prev;
      inst->Next = next;
      inst->U.I.DstReg.Index = dup_output;
   } else {
      /* MOV output, temp */
      inst = rc_insert_new_instruction(c, insert_pos);
      inst->U.I.Opcode = RC_OPCODE_MOV;
      inst->U.I.DstReg.File = RC_FILE_OUTPUT;
      inst->U.I.DstReg.Index = output;

      inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
      inst->U.I.SrcReg[0].Index = tempreg;
      inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;

      /* MOV dup_output, temp */
      inst = rc_insert_new_instruction(c, inst);
      inst->U.I.Opcode = RC_OPCODE_MOV;
      inst->U.I.DstReg.File = RC_FILE_OUTPUT;
      inst->U.I.DstReg.Index = dup_output;

      inst->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
      inst->U.I.SrcReg[0].Index = tempreg;
      inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
   }

   c->Program.OutputsWritten |= 1U << dup_output;
}

/**
 * Introduce standard code fragment to deal with fragment.position.
 *
 * Three (optionally four) instructions are chained starting at the
 * instruction list head, and every later read of input \p wpos is redirected
 * to the temporary they produce:
 *
 *   RCP rcp.w,   new_input.w
 *   MUL mul.xyz, new_input, rcp.w
 *   MAD mad.xyz, mul.xyz0, const1.xyz0, const2.xyz0
 *   MOV mad.w,   rcp.w            (only if some read of wpos uses W)
 *
 * \param wpos            index of the input whose reads are replaced
 * \param new_input       index of the input the replacement is computed from
 * \param full_vtransform non-zero: the MAD uses the R300 viewport scale and
 *                        offset state constants; zero: both MAD constants are
 *                        the R300 window dimension state constant
 */
void
rc_transform_fragment_wpos(struct radeon_compiler *c, unsigned wpos, unsigned new_input,
                           int full_vtransform)
{
   struct rc_instruction *inst_rcp, *inst_mul, *inst_mad, *inst_mov, *inst;

   /* The program no longer reads wpos directly; it reads new_input instead. */
   c->Program.InputsRead &= ~(1U << wpos);
   c->Program.InputsRead |= 1U << new_input;

   /* Figure out what channels we actually need. */
   unsigned usemask = 0;
   for (inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
      const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
      unsigned i;

      for (i = 0; i < opcode->NumSrcRegs; i++) {
         if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT && inst->U.I.SrcReg[i].Index == wpos)
            usemask |= rc_swizzle_to_writemask(inst->U.I.SrcReg[i].Swizzle);
      }
   }

   /* perspective divide */
   inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions);
   inst_rcp->U.I.Opcode = RC_OPCODE_RCP;
   /* Make sure there is no temp reusing, some later passes will depend on the SSA-like form. */
   unsigned temp_reg_rcp = rc_find_free_temporary(c);

   inst_rcp->U.I.DstReg.File = RC_FILE_TEMPORARY;
   inst_rcp->U.I.DstReg.Index = temp_reg_rcp;
   inst_rcp->U.I.DstReg.WriteMask = RC_MASK_W;

   inst_rcp->U.I.SrcReg[0].File = RC_FILE_INPUT;
   inst_rcp->U.I.SrcReg[0].Index = new_input;
   inst_rcp->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;

   inst_mul = rc_insert_new_instruction(c, inst_rcp);
   inst_mul->U.I.Opcode = RC_OPCODE_MUL;
   unsigned temp_reg_mul = rc_find_free_temporary(c);

   inst_mul->U.I.DstReg.File = RC_FILE_TEMPORARY;
   inst_mul->U.I.DstReg.Index = temp_reg_mul;
   inst_mul->U.I.DstReg.WriteMask = RC_MASK_XYZ;

   /* NOTE(review): SrcReg[0].Swizzle is not set here, so it keeps whatever
    * rc_insert_new_instruction() initialised it to - presumably the identity
    * swizzle; confirm.
    */
   inst_mul->U.I.SrcReg[0].File = RC_FILE_INPUT;
   inst_mul->U.I.SrcReg[0].Index = new_input;

   inst_mul->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
   inst_mul->U.I.SrcReg[1].Index = temp_reg_rcp;
   inst_mul->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_WWWW;

   /* viewport transformation */
   inst_mad = rc_insert_new_instruction(c, inst_mul);
   inst_mad->U.I.Opcode = RC_OPCODE_MAD;
   unsigned temp_reg_mad = rc_find_free_temporary(c);

   inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
   inst_mad->U.I.DstReg.Index = temp_reg_mad;
   inst_mad->U.I.DstReg.WriteMask = RC_MASK_XYZ;

   inst_mad->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
   inst_mad->U.I.SrcReg[0].Index = temp_reg_mul;
   inst_mad->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZ0;

   /* The constant indices of SrcReg[1] and SrcReg[2] are filled in below,
    * depending on full_vtransform.
    */
   inst_mad->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
   inst_mad->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XYZ0;

   inst_mad->U.I.SrcReg[2].File = RC_FILE_CONSTANT;
   inst_mad->U.I.SrcReg[2].Swizzle = RC_SWIZZLE_XYZ0;

   /* The MAD only writes XYZ. If any consumer reads the W channel of wpos,
    * copy the reciprocal computed by the RCP into W of the final temporary.
    */
   if (usemask & RC_MASK_W) {
      inst_mov = rc_insert_new_instruction(c, inst_mad);
      inst_mov->U.I.Opcode = RC_OPCODE_MOV;

      inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
      inst_mov->U.I.DstReg.Index = temp_reg_mad;
      inst_mov->U.I.DstReg.WriteMask = RC_MASK_W;

      inst_mov->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
      inst_mov->U.I.SrcReg[0].Index = temp_reg_rcp;
      inst_mov->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_WWWW;
   }

   if (full_vtransform) {
      inst_mad->U.I.SrcReg[1].Index =
         rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_VIEWPORT_SCALE, 0);
      inst_mad->U.I.SrcReg[2].Index =
         rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_VIEWPORT_OFFSET, 0);
   } else {
      /* Same state constant for both the multiplier and the addend. */
      inst_mad->U.I.SrcReg[1].Index = inst_mad->U.I.SrcReg[2].Index =
         rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
   }

   /* Redirect every read of wpos after the MAD to the final temporary. The
    * optional MOV is also visited here but reads no input, so it is left
    * untouched.
    */
   for (inst = inst_mad->Next; inst != &c->Program.Instructions; inst = inst->Next) {
      const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
      unsigned i;

      for (i = 0; i < opcode->NumSrcRegs; i++) {
         if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT && inst->U.I.SrcReg[i].Index == wpos) {
            inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
            inst->U.I.SrcReg[i].Index = temp_reg_mad;
         }
      }
   }
}

/**
 * The FACE input in hardware contains 1 if it's a back face, 0 otherwise.
 * Gallium and OpenGL define it the other way around.
 *
 * An "ADD tmp.x, 1, -face.x" is inserted at the instruction list head and
 * every later read of input \p face is redirected to that temporary.
 */
void
rc_transform_fragment_face(struct radeon_compiler *c, unsigned face)
{
   struct rc_instruction *const head = &c->Program.Instructions;
   const unsigned flipped = rc_find_free_temporary(c);
   struct rc_instruction *flip = rc_insert_new_instruction(c, head);
   struct rc_instruction *cur;

   /* flipped.x = 1 - face.x */
   flip->U.I.Opcode = RC_OPCODE_ADD;

   flip->U.I.DstReg.WriteMask = RC_MASK_X;
   flip->U.I.DstReg.Index = flipped;
   flip->U.I.DstReg.File = RC_FILE_TEMPORARY;

   /* First operand: the constant 1, expressed purely through the swizzle. */
   flip->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111;
   flip->U.I.SrcReg[0].File = RC_FILE_NONE;

   /* Second operand: the negated hardware face value. */
   flip->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
   flip->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XXXX;
   flip->U.I.SrcReg[1].Index = face;
   flip->U.I.SrcReg[1].File = RC_FILE_INPUT;

   /* Point all remaining readers of the face input at the temporary. */
   for (cur = flip->Next; cur != head; cur = cur->Next) {
      const struct rc_opcode_info *info = rc_get_opcode_info(cur->U.I.Opcode);

      for (unsigned src = 0; src < info->NumSrcRegs; src++) {
         if (cur->U.I.SrcReg[src].File != RC_FILE_INPUT || cur->U.I.SrcReg[src].Index != face)
            continue;

         cur->U.I.SrcReg[src].File = RC_FILE_TEMPORARY;
         cur->U.I.SrcReg[src].Index = flipped;
      }
   }
}

/**
 * Register-read callback used by rc_get_stats().
 *
 * \p userdata points to the struct rc_program_stats being filled in:
 *  - temporaries: num_temp_regs tracks the highest index seen,
 *  - inline constants: num_inline_literals counts every read,
 *  - constants: num_consts tracks the highest index seen plus one.
 *
 * \p inst and \p mask are not used.
 */
static void
reg_count_callback(void *userdata, struct rc_instruction *inst, rc_register_file file,
                   unsigned int index, unsigned int mask)
{
   struct rc_program_stats *stats = userdata;

   switch (file) {
   case RC_FILE_TEMPORARY:
      if ((int)index > stats->num_temp_regs)
         stats->num_temp_regs = index;
      break;
   case RC_FILE_INLINE:
      stats->num_inline_literals++;
      break;
   case RC_FILE_CONSTANT:
      stats->num_consts = MAX2(stats->num_consts, index + 1);
      break;
   default:
      break;
   }
}

/**
 * Collect statistics about the current program into \p s.
 *
 * \p s is zeroed first. The instruction list is walked once; for each
 * instruction the register reads are tallied through reg_count_callback()
 * and the instruction, cycle, flow control, texture, presubtract, omod and
 * predicate counters are updated. BEGIN_TEX instructions only contribute
 * their cycle penalty: they are not counted in num_insts or in the
 * per-instruction cycle.
 */
void
rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
{
   struct rc_instruction *tmp;
   memset(s, 0, sizeof(*s));
   /* Position of the current instruction in the list (BEGIN_TEX included). */
   unsigned ip = 0;
   /* ip of the last BEGIN_TEX that was charged the penalty, or -1 if none is pending. */
   int last_begintex = -1;

   for (tmp = c->Program.Instructions.Next; tmp != &c->Program.Instructions;
        tmp = tmp->Next, ip++) {
      const struct rc_opcode_info *info;
      rc_for_all_reads_mask(tmp, reg_count_callback, s);
      if (tmp->Type == RC_INSTRUCTION_NORMAL) {
         info = rc_get_opcode_info(tmp->U.I.Opcode);
         if (info->Opcode == RC_OPCODE_BEGIN_TEX) {
            /* The R5xx docs mention ~30 cycles in section 8.3.1
             * The only case when we don't want to add the cycles
             * penalty is when the texblock contains only kil.
             */
            const struct rc_opcode_info *next_op = rc_get_opcode_info(tmp->Next->U.I.Opcode);
            struct rc_instruction *second_next_instr = tmp->Next->Next;
            const struct rc_opcode_info *second_next_op;
            if (second_next_instr->Type == RC_INSTRUCTION_NORMAL) {
               second_next_op = rc_get_opcode_info(second_next_instr->U.I.Opcode);
            } else {
               second_next_op = rc_get_opcode_info(second_next_instr->U.P.RGB.Opcode);
            }
            /* Penalty applies unless the block is a lone KIL, i.e. the first
             * instruction is KIL and the one after it is not a normal
             * texture instruction.
             */
            if (next_op->Opcode != RC_OPCODE_KIL ||
                (second_next_instr->Type == RC_INSTRUCTION_NORMAL && second_next_op->HasTexture)) {
               s->num_cycles += 30;
               last_begintex = ip;
            }
            /* Skip the common accounting below; ip is still advanced by the
             * for-loop increment.
             */
            continue;
         }
         /* One extra cycle for a MAD reading three different temporaries. */
         if (info->Opcode == RC_OPCODE_MAD && rc_inst_has_three_diff_temp_srcs(tmp))
            s->num_cycles++;
      } else {
         if (tmp->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used)
            s->num_presub_ops++;
         if (tmp->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Used)
            s->num_presub_ops++;
         /* Assuming alpha will never be a flow control or
          * a tex instruction. */
         if (tmp->U.P.Alpha.Opcode != RC_OPCODE_NOP)
            s->num_alpha_insts++;
         if (tmp->U.P.RGB.Opcode != RC_OPCODE_NOP)
            s->num_rgb_insts++;
         if (tmp->U.P.RGB.Omod != RC_OMOD_MUL_1 && tmp->U.P.RGB.Omod != RC_OMOD_DISABLE) {
            s->num_omod_ops++;
         }
         if (tmp->U.P.Alpha.Omod != RC_OMOD_MUL_1 && tmp->U.P.Alpha.Omod != RC_OMOD_DISABLE) {
            s->num_omod_ops++;
         }
         if (tmp->U.P.Nop)
            s->num_cycles++;
         /* SemWait has effect only on R500, the more instructions we can put
          * between the tex block and the first texture semaphore, the better.
          */
         if (tmp->U.P.SemWait && c->is_r500 && last_begintex != -1) {
            /* Give back up to the full 30-cycle penalty, one cycle per
             * instruction position since the BEGIN_TEX.
             */
            s->num_cycles -= MIN2(30, ip - last_begintex);
            last_begintex = -1;
         }
         /* For pair instructions the common accounting below is driven by
          * the RGB opcode.
          */
         info = rc_get_opcode_info(tmp->U.P.RGB.Opcode);
      }
      if (info->IsFlowControl) {
         s->num_fc_insts++;
         if (info->Opcode == RC_OPCODE_BGNLOOP)
            s->num_loops++;
      }
      /* VS flow control was already translated to the predicate instructions */
      if (c->type == RC_VERTEX_PROGRAM)
         if (strstr(info->Name, "PRED") != NULL)
            s->num_pred_insts++;

      if (info->HasTexture)
         s->num_tex_insts++;
      s->num_insts++;
      s->num_cycles++;
   }
   /* Increment here because reg_count_callback stores the max
    * temporary reg index in s->num_temp_regs, not the count. */
   s->num_temp_regs++;
}

static void
print_stats(struct radeon_compiler *c)
{
   struct rc_program_stats s;

   rc_get_stats(c, &s);

   /* Note that we print some dummy values for instruction categories that
    * only the FS has, because shader-db's report.py wants all shaders to
    * have the same set.
    */
   util_debug_message(
      c->debug, SHADER_INFO,
      "%s shader: %u inst, %u vinst, %u sinst, %u predicate, %u flowcontrol, "
      "%u loops, %u tex, %u presub, %u omod, %u temps, %u consts, %u lits, %u cycles",
      c->type == RC_VERTEX_PROGRAM ? "VS" : "FS", s.num_insts, s.num_rgb_insts, s.num_alpha_insts,
      s.num_pred_insts, s.num_fc_insts, s.num_loops, s.num_tex_insts, s.num_presub_ops,
      s.num_omod_ops, s.num_temp_regs, s.num_consts, s.num_inline_literals, s.num_cycles);
}

/* Human-readable program type names for log output, indexed by c->type. */
static const char *shader_name[RC_NUM_PROGRAM_TYPES] = {"Vertex Program", "Fragment Program"};

bool
rc_run_compiler_passes(struct radeon_compiler *c, struct radeon_compiler_pass *list)
{
   for (unsigned i = 0; list[i].name; i++) {
      if (list[i].predicate) {
         list[i].run(c, list[i].user);

         if (c->Error)
            return false;

         if ((c->Debug & RC_DBG_LOG) && list[i].dump) {
            fprintf(stderr, "%s: after '%s'\n", shader_name[c->type], list[i].name);
            rc_print_program(&c->Program);
         }
      }
   }
   return true;
}

/**
 * Execute the list of compiler passes given in \p list.
 *
 * With RC_DBG_LOG enabled the input program is printed first. Statistics
 * are reported only when all passes completed without an error.
 */
void
rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list)
{
   const bool logging = (c->Debug & RC_DBG_LOG) != 0;

   if (logging) {
      fprintf(stderr, "%s: before compilation\n", shader_name[c->type]);
      rc_print_program(&c->Program);
   }

   if (!rc_run_compiler_passes(c, list))
      return;

   print_stats(c);
}

/**
 * Final validation pass: raise a compiler error when the program uses more
 * constants than c->max_constants allows. \p user is not used.
 */
void
rc_validate_final_shader(struct radeon_compiler *c, void *user)
{
   /* Within the constant limit: nothing to report. */
   if (c->Program.Constants.Count <= c->max_constants)
      return;

   rc_error(c, "Too many constants. Max: %i, Got: %i\n", c->max_constants,
            c->Program.Constants.Count);
}
